# Commands to start an interactive session on the JHPCE cluster
qrsh -l mem_free=20G,h_vmem=20G
module load conda_R
cd /fastscratch/myscratch/akuo/alsf-filbin
R
library(here)
dataset = "mrna" # options = "premrna_mrna", "mrna", or "premrna"

Create SummarizedExperiment object

First, we create a table with information about where each file (quantified counts from salmon) is. This will be used to create a SummarizedExperiment object.

# list tumor names
tumor_names <- list.files(here("sample_data"))[
                  !grepl("*.txt", list.files(here("sample_data")))]

unique_sf_paths <- NULL
for(tum in tumor_names){
  ids <- list.files(here("sample_data", tum))
  ids <- unique(stringr::str_sub(ids, end=-13))
  if(!identical(ids, character(0))){
    ids <- here(paste0("salmon_quants_", dataset), paste0(ids, "_quant"), "quant.sf")
  }
  unique_sf_paths <- c(unique_sf_paths, ids)
}

unique_sf_ids <- NULL
for(tum in tumor_names){
  ids <- list.files(here("sample_data", tum))
  ids <- unique(stringr::str_sub(ids, end=-13))
  unique_sf_ids <- c(unique_sf_ids, ids)
}

coldata <- data.frame(files = unique_sf_paths, names = unique_sf_ids)

We also need to use the linkedTxome object to use tximeta properly, i.e. rowRanges(se) won’t be NULL and tximeta will be able to match the transcripts to the genes for us. Note: still doesn’t work

suppressPackageStartupMessages({
  library(tximeta)
  library(BiocFileCache)
})

# check if linkedTxome is already in the cache
bfcloc <- getTximetaBFC()
bfc <- BiocFileCache(bfcloc)
bfcinfo(bfc)

# if not, load linkedTxome json file
json_file <- here("salmon_files", "gencode.v32_salmon-index-v1.0.0.json")
loadLinkedTxome(json_file)

The only way I’ve figured out how to make this work for now is with skipMeta = TRUE.

se_file_name = here(paste0("salmon_quants_", dataset), paste0("se_", dataset, ".rds"))

# coldata = coldata[1:2, ] # for testing
if(!file.exists(se_file_name)){
  # Create SummarizedExperiment object
  if(dataset == "premrna")
    se <- tximeta(coldata, skipMeta = TRUE) # Takes a couple of minutes, file size = 800 MB
  else if (dataset == "mrna")
    se <- tximeta(coldata) # run this line if you used mRNA transcripts in your index only, it will automatically detect the right transcriptome
  # se <- tximeta(coldata, ignoreAfterBar = TRUE)
  saveRDS(se, se_file_name)
} else {
  se = readRDS(se_file_name) # Takes a couple of seconds
}
suppressPackageStartupMessages({
  library(SummarizedExperiment)
  library(DESeq2)
  library(scater)
})

colData(se)
assayNames(se)
rowRanges(se)

dat = assay(se, "counts")

Create SingleCellExperiment object

A SingleCellExperiment class is derived from the SummarizedExperiment class. The most important change is the addition of a new slot called reducedDims. Read more here.

# BiocManager::install('SingleCellExperiment')
library(SingleCellExperiment)
sce = SingleCellExperiment(assays = list(counts = dat))

# you can access counts by assay(sce, "counts") or counts(sce)
# you can add a new entry to assays slot by assay(sce, "counts_new") = dat_new

Add gene names (hgnc symbol)

# This will take a long time (> 45 min); should probably separate out as a script
tic()
sce <- getBMFeatureAnnos(sce, 
            ids = rownames(sce),
            filters = "ensembl_gene_id", 
            attributes = c("ensembl_gene_id", "hgnc_symbol",
                           "gene_biotype"),
            biomart = "ENSEMBL_MART_ENSEMBL", 
            dataset = "hsapiens_gene_ensembl",
            host = "www.ensembl.org")
toc()
rowData(sce)$gene_name_unique <- 
  uniquifyFeatureNames(rowData(sce)$gene_name_ensembl, 
                     rowData(sce)$hgnc_symbol)

Quality Control

There are a couple of QC metrics to identify low-quality cells:

  1. Using counts, i.e. cells with (a) a small library size (total sum of counts) low_lib_size or (b) few expressed endogeneous genes (nonzero counts for those genes) low_n_features
  2. Using “spike-in transcripts”, i.e. any enrichment of spike-in transcripts (higher proportion)
  3. Using the mitochondrial genome, i.e. any enrichment of reads in the mitochondrial genome is indicative of loss of cytoplasmic RNA

I will only do (1) for now.

library(scater)

# Compute quality control metrics:
# sum is the total count for each cell
# detected contains the number of detected genes (actually transcripts for our data)
df = perCellQCMetrics(sce)
df

# Find outliers with low library sizes (LibSize) and few detected features (n_features)
reasons = quickPerCellQC(df) # DataFrame of logical values
colSums(as.matrix(reasons))

# Discard outliers
filtered = sce[, !reasons$discard]
dat_filtered = counts(filtered) # 226608 x 1018 for mrna, 221988 x 1090 for premrna
sce_filtered = SingleCellExperiment(assays = list(counts = dat_filtered))

filtered_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_filtered_", dataset, ".rds"))
saveRDS(sce_filtered, filtered_file_name)

Diagnostic plots: https://osca.bioconductor.org/quality-control.html#quality-control-plots

Transform read counts

library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
dataset = "mrna" # options = "premrna_mrna", "mrna", or "premrna"

Convert read counts to pseudo-UMIs

Run compute-mle.sh to get mle sig (“shape”) parameter first. There will be a parameter for every cell. It will take about 30 minutes to an hour.

# Load filtered counts
filtered_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_filtered_", dataset, ".rds"))
if(file.exists(filtered_file_name)){
  sce_filtered = readRDS(filtered_file_name)
  dat_filtered = counts(sce_filtered)
}

# Read in mle parameters
mle_file_names = list.files(here(paste0("mle_results_", dataset)), full.names = TRUE)
mle_results = lapply(mle_file_names, readRDS)
sig_vec = sapply(mle_results, function(r) r["sig"])
mu_vec = sapply(mle_results, function(r) r["mu"])

# Plot sig ("shape") distribution
ggplot(tibble(sig = sig_vec), aes(x = sig)) +
  geom_histogram(bins = 30) +
  labs(title = "Distribution of sig parameter",
       x = "sig",
       y = "Number of cells")
source(here("scripts", "quminorm.R"))

# Convert to pseudo-UMIs
umi_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_umi_", dataset, ".rds"))
if(file.exists(umi_file_name)){
  dat_umi = readRDS(umi_file_name)
} else {
  dat_umi = quminorm_matrix(dat_filtered, shape = median(sig_vec)) # Takes several minutes (20~30 min). In the paper, they use the mode, but I will use the median since it is not clear what mode means for continuous values. The median for mRNA is 3.16. The median for pre-mRNA is 3.28. 
  dat_umi = dat_umi[!(rowSums(dat_umi) == 0), ] # Remove transcripts with all zeros
  sce_umi = SingleCellExperiment(assays = list(counts = dat_umi))
  saveRDS(sce_umi, umi_file_name)
}

Aggregate counts at gene level

umi_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_umi_", dataset, ".rds"))
sce_umi = readRDS(umi_file_name)
dat_umi = counts(sce_umi)

# Counts by transcript
dim(dat_umi) # 192459 x 1018 for mRNA, 186368 x 1090 for premRNA

# Read in mrna data to look up genes
se_mrna = readRDS(here(paste0("salmon_quants_mrna"), paste0("se_mrna.rds")))
transcripts_mrna = rownames(assay(se_mrna, "counts"))
row_ranges = rowRanges(se_mrna)
genes = unlist(elementMetadata(row_ranges)$gene_id)
mrna_gene_matching_dt = tibble(transcript = transcripts_mrna, genes)

if(dataset == "mrna"){
  matched_index = match(rownames(dat_umi), mrna_gene_matching_dt$transcript)
  dat_gene = rowsum(dat_umi, group = mrna_gene_matching_dt$genes[matched_index], reorder = FALSE)
  dim(dat_gene) # 52142 x 1018 for mRNA
} else if(dataset == "premrna") {
  # Get genes for every premrna transcript
  # 171987 in intersection of premRNA and mRNA transcripts
  # 20472 mRNA transcripts did not have an premRNA counterpart
  # 14381 premRNA transcripts did not have a mRNA counterpart
  # Note some transcripts were dropped in the pseudo-UMI conversion step (when removing transcripts with zero counts). If including those,
  # 221685 in intersection
  # 4923 mRNA transcripts did not have a premRNA counterpart
  # 303 premRNA transcripts did not have a mRNA counterpart
  transcripts_premrna = rownames(dat_umi)
  transcripts_premrna_converted = gsub(".{8}$", "" , transcripts_premrna)
  matched_index = match(transcripts_premrna_converted, mrna_gene_matching_dt$transcript)
  dat_gene = rowsum(dat_umi, group = mrna_gene_matching_dt$genes[matched_index], reorder = FALSE)
  # Remove counts for unmatched transcripts
  dat_gene = dat_gene[!is.na(rownames(dat_gene)), ]
  dim(dat_gene) # 53006 x 1090
}

gene_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_gene_", dataset, ".rds"))
sce_gene = SingleCellExperiment(assays = list(counts = dat_gene))
saveRDS(sce_gene, gene_file_name)

Exploratory Plots

library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
## Warning: package 'SummarizedExperiment' was built under R version 3.6.2
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## Warning: package 'DelayedArray' was built under R version 3.6.2
## Warning: package 'BiocParallel' was built under R version 3.6.2
library(scattermore)
library(tictoc)

sce_prem = readRDS(here(paste0("salmon_quants_premrna"), "sce_gene_premrna.rds"))
sce_mrna = readRDS(here(paste0("salmon_quants_mrna"), "sce_gene_mrna.rds"))

dat_prem = counts(sce_prem)
dat_mrna = counts(sce_mrna)

Compare pre-mRNA vs mRNA counts

Log-log counts

dat_prem_rowmeans = tibble(genes = rownames(dat_prem),
                           mean_prem = rowMeans(dat_prem))
dat_mrna_rowmeans = tibble(genes = rownames(dat_mrna),
                           mean_mrna = rowMeans(dat_mrna))
dat_rowmeans = full_join(dat_prem_rowmeans, dat_mrna_rowmeans, by = "genes")
tic()
dat_rowmeans %>%
  ggplot(aes(x = log(mean_mrna),
             y = log(mean_prem))) + 
  geom_point(alpha = 0.1) +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(x = "Log(average gene expression with mRNA)",
       y = "Log(average gene expression with pre-mRNA)") +
  theme_bw()
## Warning: Removed 3592 rows containing missing values (geom_point).

toc() # 2.5 sec
## 2.424 sec elapsed
# Smoothscatter
tic()
smoothScatter(x = log(dat_rowmeans$mean_mrna),
              y = log(dat_rowmeans$mean_prem),
              xlab = "Log(average gene expression with mRNA)",
              ylab = "Log(average gene expression with pre-mRNA)")

toc() # 1 sec
## 0.347 sec elapsed
# Scattermoreplot version
tic()
d = dat_rowmeans %>%
  mutate(log_mean_prem = log(mean_prem),
         log_mean_mrna = log(mean_mrna)) %>%
  select(log_mean_prem, log_mean_mrna)
  
d = d[complete.cases(d), ] %>% as.matrix()
scattermoreplot(d,
                main='Scattermore version')

toc() # 0.5 sec
## 0.161 sec elapsed

MA plot

MA plot is a plot of log-intensity ratios (M) vs log-intensity averages (A)

dat_rowmeans = dat_rowmeans %>%
  mutate(m = log(mean_prem) - log(mean_mrna),
         a = (log(mean_prem) + log(mean_mrna)/2))

# ggplot geom_point
dat_rowmeans %>%
  ggplot(aes(x = a,
             y = m)) + 
  geom_point(alpha = 0.1) +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  labs(x = "A",
       y = "M") +
  theme_bw()
## Warning: Removed 3592 rows containing non-finite values (stat_smooth).
## Warning: Removed 3592 rows containing missing values (geom_point).

# ggplot smoothscatter version
dat_rowmeans %>%
  ggplot(aes(x = a,
             y = m)) + 
  stat_density2d(aes(fill = ..density..^0.3), geom = "tile", contour = FALSE, n = 200) +
  scale_fill_continuous(low = "white", high = "#1C5E7E") + 
  geom_hline(yintercept = 0, color = "red") +
  geom_smooth() +
  labs(x = "A",
       y = "M") +
  theme_bw()
## Warning: Removed 3592 rows containing non-finite values (stat_density2d).
## Warning: Removed 3592 rows containing non-finite values (stat_smooth).

# smoothscatter version
smoothScatter(x = dat_rowmeans$a, y = dat_rowmeans$m,
              xlab = "A",
              ylab = "M")

Plot for one cell only.

dat_prem_subset = as_tibble(dat_prem[, 1:5]) %>%
  rename_all(function(x) paste0(x, "_premrna")) %>%
  mutate(genes = rownames(dat_prem))
dat_mrna_subset = as_tibble(dat_mrna[, 1:5]) %>%
  rename_all(function(x) paste0(x, "_mrna")) %>%
  mutate(genes = rownames(dat_mrna))
dat_subset = full_join(dat_prem_subset, dat_mrna_subset, by = "genes")
dat_subset %>%
  mutate(m = log(`BT1179Nuc-P1-A01_premrna`) - log(`BT1179Nuc-P1-A01_mrna`),
         a = (log(`BT1179Nuc-P1-A01_premrna`) + log(`BT1179Nuc-P1-A01_mrna`)/2)) %>%
  ggplot(aes(x = a,
             y = m)) + 
  stat_density2d(aes(fill = ..density..^0.3), geom = "tile", contour = FALSE, n = 200) +
  scale_fill_continuous(low = "white", high = "#1C5E7E") + 
  geom_hline(yintercept = 0, color = "red") +
  geom_smooth() +
  geom_point(alpha = 0.1) +
  labs(x = "A",
       y = "M") +
  theme_bw()
## Warning: Removed 51526 rows containing non-finite values (stat_density2d).
## Warning: Removed 51526 rows containing non-finite values (stat_smooth).
## Warning: Removed 42769 rows containing missing values (geom_point).

Comparison to parametric distributions

pre-mRNA

First, I make all the column sums the same by scaling. The count in row \(i\) and column \(j\) is transformed as \(x_{ij} = x_{ij}*\sum_i x_{ij}/median_j(\sum_i x_{ij})\).

summary(colSums(dat_prem))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1981   14825   36103   44062   61583  351921
# Apply transformation
n = round(median(colSums(dat_prem)))
dat_prem = apply(dat_prem, MARGIN = 2, function(x) x*n/sum(x))

summary(colSums(dat_prem))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   36103   36103   36103   36103   36103   36103

I plot the mean and variance for every row (transcript). Under a poisson distribution, they should be the same.

plot_meanvar = function(dat_sub){
  # estimate lambdas and variances for every transcript
  means = rowMeans(dat_sub)
  vars = apply(dat_sub, MARGIN = 1, var)
  
  # variance = mean under Poisson distribution
  tibble(means, vars) %>%
    ggplot(aes(x = log(means), y = log(vars))) +
    geom_point(alpha = 0.4) +
    geom_abline(intercept = 0, slope = 1)
}
plot_meanvar(dat_prem) +
  labs(title = "pre-mRNA")

I first compute the average expression for every row (x-axis) \(\hat{\mu}_i\) and the empirical \(P(X_i=0)\), which is the probability that for a given transcript \(i\), the count is 0.

I then compute what \(P(X_i=0)\) would be under the model assumptions of binomial or poisson, using parameters estimated from the data. In particular, for a \(Binom(n, p_i)\), \(n\) is the median number of total counts of cells and \(p_i\) is the mean proportion of counts that were in gene \(i\).

# Function to plot P(X_i = 0) against average expression level mu_i
plot_prob = function(dat_sub){
  n = round(median(colSums(dat_sub)))
  means = rowMeans(dat_sub)
  vars = apply(dat_sub, MARGIN = 1, var)
  
  # empirical P(X_i = 0)
  emp_probs_0 = apply(dat_sub, MARGIN = 1, function(r) sum(r==0)/ncol(dat_sub))
  plot_dt = tibble(means, emp_probs_0)
  
  emp_props = rowMeans(dat_sub/colSums(dat_sub))
  # Model P(X_i = 0) under Binomial
  binom_probs_0 = dbinom(x = 0, size = n, prob = emp_props)
  # Model P(X_i = 0) under Poisson
  poiss_probs_0 = dpois(x = 0, lambda = n*emp_props)
  # Model P(X_i = 0) under Negative Binomial
  # Estimate size/dispersion parameter
  model = lm(vars ~ 1*means + I(means^2) + 0, tibble(means, vars))
  phi = 1/coef(model)["I(means^2)"]
  nbinom_probs_0 = dnbinom(x = 0, size = phi, mu = n*emp_props) 
  
  # Tibble for plot
  plot_lines_dt = tibble(means = means,
                         binomial = binom_probs_0,
                         poisson = poiss_probs_0,
                         nbinomial = nbinom_probs_0) %>%
    pivot_longer(-means, names_to = "model", values_to = "probs_0")
  
  # Plot
  plt = plot_lines_dt %>%
    ggplot(aes(x = log(means), y = probs_0)) +
    geom_point(data = plot_dt, aes(x = log(means), y = emp_probs_0), alpha = 0.4) + # Add data points
    geom_line(aes(color = model),
              size = 1.5) + # Add lines for models
    labs(x = "Average expression level log(E(X_i))",
         y = "P(X_i = 0)") +
    theme(text = element_text(size = 15))
  
  # Return object
  out = list(plot = plt,
             lines_dt = plot_lines_dt)
  return(out)
}
# Plot P(X_i = 0) against average expression level
prob_out_prem = plot_prob(dat_prem)
prob_out_prem$plot +
  labs(title = "pre-mRNA")

mRNA

summary(colSums(dat_mrna))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1695   22260   47320   55303   77240  440616
# Apply transformation
n = round(median(colSums(dat_mrna)))
dat_mrna = apply(dat_mrna, MARGIN = 2, function(x) x*n/sum(x))

summary(colSums(dat_mrna))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47320   47320   47320   47320   47320   47320
# Plot mean against variance
plot_meanvar(dat_mrna) +
  labs(title = "mRNA")

# Plot P(X_i = 0) against average expression level
prob_out_mrna = plot_prob(dat_mrna)
prob_out_mrna$plot +
  labs(title = "mRNA")

PCA

library(scran)
## Warning: package 'scran' was built under R version 3.6.2
library(scater)
library(BiocSingular)
## Warning: package 'BiocSingular' was built under R version 3.6.2
# library(factoextra)
library(tictoc)
library(glmpca)
source(here("scripts", "functions_genefilter.R"))

pre-mRNA

sce = sce_prem
dat = dat_prem
# Normalize and take log
clust = quickCluster(sce)
table(clust)
## clust
##   1   2   3   4   5   6   7 
## 118 202 169 131 163 100 207
sce = computeSumFactors(sce, clusters = clust)
sce = logNormCounts(sce)
# Filter out highly variant genes
# filterHVG(sce)
set.seed(1)
tic("approx PCA")
sce = runPCA(sce, exprs_values = "logcounts", ntop = ncol(sce), BSPARAM = IrlbaParam())
pc = list(x = reducedDim(sce, "PCA"))
toc()
## approx PCA: 2.162 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(t(X), rank = 5, BSPARAM = RandomParam())
# toc()

# Run PCA with prcomp (~10 minutes)
# X = t(X)
# nonzero_var_cols = which(apply(X, 2, var) != 0) # Remove columns (transcripts) with zero variance
# X = X[, nonzero_var_cols]
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))

# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
  ggplot(aes(PC1, PC2, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
  ggplot(aes(PC1, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
  ggplot(aes(PC2, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
  ggplot(aes(PC4, PC5, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

mRNA

sce = sce_mrna
dat = dat_mrna
# Normalize and take log
clust = quickCluster(sce)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
table(clust)
## clust
##   1   2   3   4   5 
## 184 135 239 210 250
sce = computeSumFactors(sce, clusters = clust)
sce = logNormCounts(sce)
set.seed(1)
tic("approx PCA")
sce = runPCA(sce, exprs_values = "logcounts", BSPARAM = IrlbaParam())
pc = list(x = reducedDim(sce, "PCA"))
toc()
## approx PCA: 1.632 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(t(X), rank = 5, BSPARAM = RandomParam())
# toc()

# Run PCA with prcomp (~10 minutes)
# X = t(X)
# nonzero_var_cols = which(apply(X, 2, var) != 0) # Remove columns (transcripts) with zero variance
# X = X[, nonzero_var_cols]
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))

# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
  ggplot(aes(PC1, PC2, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
  ggplot(aes(PC1, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
  ggplot(aes(PC2, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
  ggplot(aes(PC4, PC5, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

GLM-PCA

pre-mRNA

sce = sce_prem
dat = dat_prem
X = dat

# Takes a long time, may want to try on cluster
# Time increases with number of rows and number of latent dimensions ("L")
tic("glm PCA")
glmpc = glmpca(X, L = 3, fam = c("poi"))
toc()
tumor_labels = gsub("-.*$", "", colnames(sce))

# Plot PC
data.frame(PC1 = glmpc$factors[,1], PC2 = glmpc$factors[,2], color = tumor_labels) %>%
  ggplot(aes(PC1, PC2, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC1 = glmpc$factors[,1], PC3 = glmpc$factors[,3], color = tumor_labels) %>%
  ggplot(aes(PC1, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC2 = glmpc$factors[,2], PC3 = glmpc$factors[,3], color = tumor_labels) %>%
  ggplot(aes(PC2, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

Clustering